In [1]:
import cPickle
import datetime
import glob
import os
import random
import re
import subprocess
import cdpybio as cpb
from ipyparallel import Client
import matplotlib.gridspec as gridspec
import matplotlib.pyplot as plt
import networkx as nx
import numpy as np
import pandas as pd
pd.options.mode.chained_assignment = None # default='warn'
import pybedtools as pbt
import scipy.stats as stats
import seaborn as sns
import statsmodels as sms
import statsmodels.api as sm
import statsmodels.formula.api
import statsmodels.formula.api as smf
import tabix
import vcf as pyvcf
import cardipspy as cpy
import ciepy
%matplotlib inline
dy_name = 'mcnv_analysis'
import socket
if socket.gethostname() == 'fl-hn1' or socket.gethostname() == 'fl-hn2':
dy = os.path.join(ciepy.root, 'sandbox', dy_name)
cpy.makedir(dy)
pbt.set_tempdir(dy)
outdir = os.path.join(ciepy.root, 'output', dy_name)
cpy.makedir(outdir)
private_outdir = os.path.join(ciepy.root, 'private_output', dy_name)
cpy.makedir(private_outdir)
In [2]:
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rsem_tpm.tsv')
tpm = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'rnaseq_metadata.tsv')
rna_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'subject_metadata.tsv')
subject_meta = pd.read_table(fn, index_col=0)
fn = os.path.join(ciepy.root, 'output', 'input_data', 'wgs_metadata.tsv')
wgs_meta = pd.read_table(fn, index_col=0)
gene_info = pd.read_table(cpy.gencode_gene_info, index_col=0)
gene_info['ensembl_id'] = [x.split('.')[0] for x in gene_info.index]
genes = pbt.BedTool(cpy.gencode_gene_bed)
dy = os.path.join(ciepy.root, 'output/eqtl_processing/eqtls01')
fn = os.path.join(dy, 'qvalues.tsv')
qvalues = pd.read_table(fn, index_col=0)
In [3]:
dy = os.path.join(ciepy.root, 'output/eqtl_processing/eqtls01')
fn = os.path.join(dy, 'gene_variant_pairs.tsv')
gene_variant = pd.read_table(fn, index_col=0)
In [4]:
fn = os.path.join(ciepy.root, 'output', 'cnv_processing', 'gs_combined_info.pickle')
gs_info = cPickle.load(open(fn))
fn = os.path.join(ciepy.root, 'private_output', 'cnv_processing', 'mcnvs.tsv')
mcnvs = pd.read_table(fn, index_col=0)
In [5]:
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'variant_regions.bed')
variant_regions = pbt.BedTool(fn)
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'tpm_log_filtered_phe_std_norm_peer_resid.tsv')
eqtl_tpm = pd.read_table(fn, index_col=0)
eqtl_tpm = eqtl_tpm[(gene_info.ix[eqtl_tpm.index, 'chrom'] != 'chrX') &
(gene_info.ix[eqtl_tpm.index, 'chrom'] != 'chrY') &
(gene_info.ix[eqtl_tpm.index, 'chrom'] != 'chrM')]
I want to test for an association between mCNVs and gene expression. I'm going to start by using unrelated individuals. There are genetically unrelated people in families (for instance the parents in a trio are unrelated) but for now I'll just take one person from each family.
In [6]:
fn = os.path.join(ciepy.root, 'output', 'eqtl_input', 'unrelateds.tsv')
mcnv_samples = pd.read_table(fn, index_col=0)
In [7]:
mcnv_samples_by_wgs = mcnv_samples.copy(deep=True)
mcnv_samples_by_wgs['rna_id'] = mcnv_samples_by_wgs.index
mcnv_samples_by_wgs.index = mcnv_samples_by_wgs.wgs_id
mcnv_samples_by_wgs = mcnv_samples_by_wgs.merge(subject_meta, left_on='subject_id', right_index=True)
In [8]:
mcnvs_f = mcnvs[mcnv_samples.wgs_id]
b = mcnvs_f.apply(lambda x: x.value_counts().max() < mcnvs_f.shape[1] - np.floor(mcnvs_f.shape[1] * 0.05), axis=1)
mcnvs_f = mcnvs_f[b]
In [9]:
t = gs_info.ix[mcnvs_f.index]
s = '\n'.join(t.chrom + '\t' + t.start.astype(str) + '\t' + t.end.astype(str) +
'\t' + t.name) + '\n'
mcnvs_bt = pbt.BedTool(s, from_string=True)
mcnvs_bt = mcnvs_bt.sort()
In [10]:
# Let's find out which mCNVs overlap the variant regions for the genes.
res = mcnvs_bt.intersect(variant_regions, sorted=True, wo=True)
df = res.to_dataframe()
df['gene'] = df.thickEnd.apply(lambda x: x.split('_')[0])
# I'll make a mapping from gene to mCNV.
gene_to_mcnv = pd.Series(df.name.values, index=df.gene)
In [11]:
eqtl_tpm_f = eqtl_tpm[mcnv_samples.wgs_id]
eqtl_tpm_f = cpb.general.transform_standard_normal(eqtl_tpm_f)
In [12]:
mcnvs_f.to_csv(os.path.join(private_outdir, 'filtered_mcnvs.tsv'), sep='\t')
In [13]:
def mcnv_data(gene):
cnvs = gene_to_mcnv[gene]
if type(cnvs) is pd.core.series.Series:
cnvs = list(cnvs)
else:
cnvs = [cnvs]
data = mcnvs_f.ix[cnvs].T
data['exp'] = eqtl_tpm_f.ix[gene, data.index].values
data['sex'] = (mcnv_samples_by_wgs.ix[data.index, 'sex'] == 'M').values
return data
def mcnv_regression(gene):
cnvs = gene_to_mcnv[gene]
if type(cnvs) is pd.core.series.Series:
cnvs = list(cnvs)
else:
cnvs = [cnvs]
data = mcnvs_f.ix[cnvs].T
data['exp'] = eqtl_tpm_f.ix[gene, data.index].values
data['sex'] = (mcnv_samples_by_wgs.ix[data.index, 'sex'] == 'M').values
data = mcnv_data(gene)
results = {}
for cnv in cnvs:
results[cnv] = statsmodels.formula.api.ols('exp ~ {} + sex'.format(cnv), data=data).fit()
return results
In [14]:
se = gene_to_mcnv.ix[set(gene_to_mcnv.index) & set(eqtl_tpm_f.index)]
print('{:,} total tests'.format(se.shape[0]))
print('{:,} distinct genes'.format(len(set(se.index))))
print('{:,} distinct mCNVs'.format(len(set(se.values))))
In [15]:
fn = os.path.join(outdir, 'results.pickle')
if not os.path.exists(fn):
ind = list(set(gene_to_mcnv.index) & set(eqtl_tpm_f.index))
gene_results = []
for g in ind:
gene_results.append(mcnv_regression(g))
with open(fn , 'w') as f:
cPickle.dump(gene_results, f)
else:
gene_results = cPickle.load(open(fn))
In [16]:
fn = os.path.join(outdir, 'reg_results.tsv')
if not os.path.exists(fn):
g = []
c = []
p = []
b = []
for i,d in enumerate(gene_results):
for k in d.keys():
g.append(ind[i])
c.append(k)
p.append(d[k].pvalues[k])
b.append(d[k].params[k])
reg_results = pd.DataFrame({'gene':g, 'cnv':c, 'pvalue':p, 'beta':b})
r = sms.sandbox.stats.multicomp.multipletests(reg_results.pvalue, method='fdr_bh')
reg_results['bh_pvalue'] = r[1]
reg_results['bh_sig'] = r[0]
reg_results = reg_results.merge(gs_info, left_on='cnv', right_index=True).drop(['name'], axis=1)
reg_results = reg_results.merge(gene_info, left_on='gene', right_index=True, suffixes=['', '_gene'])
reg_results['overlap_gene'] = (((reg_results.start > reg_results.start_gene) &
(reg_results.start < reg_results.end_gene)) |
((reg_results.end > reg_results.start_gene) &
(reg_results.end < reg_results.end_gene)))
a = reg_results.end - reg_results.start_gene
b = reg_results.start - reg_results.end_gene
t = pd.DataFrame([a,b]).T
ta = t.abs()
tv = t[ta.apply(lambda x: x == x.min(), axis=1)]
tv = tv.fillna(0).sum(axis=1)
reg_results['dist_to_gene'] = tv
reg_results.ix[reg_results.overlap_gene, 'dist_to_gene'] = 0
reg_results.to_csv(fn, sep='\t')
else:
reg_results = pd.read_table(fn, index_col=0)
sig = reg_results[reg_results.bh_sig]
sig.sort_values(by='pvalue', inplace=True)
In [17]:
a = len(set(sig.gene))
b = len(set(sig.gene) - set(sig.ix[sig.overlap_gene, 'gene']))
print('{} mCNV eGenes, {} of which are intergenic.'.format(a, b))
In [18]:
n = len(set(gene_variant.ix[gene_variant.variant_type == 'cnv', 'gene_id']) |
set(sig.gene))
print('{} genes have a significant CNV association considering both '
'biallelic and mCNVs.'.format(n))
In [19]:
reg_results.pvalue.hist()
plt.ylabel('Number of gene-CNV tests')
plt.xlabel('$p$-value');
In [20]:
fig,axs = plt.subplots(1, 2, figsize=(10, 4))
ax = axs[0]
sig.beta.hist(ax=ax)
ax.set_xticks([-1, -0.5, 0, 0.5, 1])
ax.set_ylabel('Number of significant hits')
ax.set_xlabel('Beta');
ax = axs[1]
sig.drop_duplicates(subset=['gene']).beta.hist(ax=axs[1])
ax.set_xticks([-1, -0.5, 0, 0.5, 1])
ax.set_ylabel('Number of significant genes')
ax.set_xlabel('Beta');
plt.tight_layout()
In [21]:
fig,axs = plt.subplots(1, 2, figsize=(10, 4))
ax = axs[0]
sig[sig.overlap_gene].beta.hist(ax=ax)
ax.set_xticks([-1, -0.5, 0, 0.5, 1])
ax.set_ylabel('Number of significant hits')
ax.set_xlabel('Beta');
ax = axs[1]
t = sig.drop_duplicates(subset=['gene'])
t[t.overlap_gene].beta.hist(ax=axs[1])
ax.set_xticks([-1, -0.5, 0, 0.5, 1])
ax.set_ylabel('Number of significant genes')
ax.set_xlabel('Beta');
plt.tight_layout()
In [22]:
sig.gene.value_counts().plot(kind='bar')
plt.xticks([]);
In [23]:
len(set(sig.gene))
Out[23]:
In [24]:
len(set(sig.ix[sig.overlap_gene, 'gene']))
Out[24]:
In [25]:
sig.overlap_gene.value_counts()
Out[25]:
In [26]:
fig,axs = plt.subplots(1, 2, figsize=(10, 4))
ax = axs[0]
(np.log10(sig.dist_to_gene.abs() + 1)).hist(ax=ax, bins=20)
ax.set_ylabel('Number of significant hits')
ax.set_xlabel('$log_{10}$ distance in bp');
ax = axs[1]
(np.log10(sig.drop_duplicates(subset=['gene']).dist_to_gene.abs() + 1)).hist(ax=axs[1], bins=20)
ax.set_ylabel('Number of significant genes')
ax.set_xlabel('$log_{10}$ distance in bp');
plt.tight_layout()
In [27]:
def plot_results(gene):
data = mcnv_data(gene)
s = sig[sig.gene == gene]
for c in s.cnv:
sns.lmplot(x=c, y='exp', data=data, x_jitter=0.2)
plt.ylabel(gene_info.ix[gene, 'gene_name'])
In [28]:
sig.shape
Out[28]:
In [29]:
plot_results(sig.gene.drop_duplicates().values[0])
In [30]:
plot_results(sig.gene.drop_duplicates().values[1])
In [31]:
sig_cnvs = set(sig.cnv)
not_sig_cnvs = set(reg_results.cnv) - sig_cnvs
a = gs_info.ix[sig_cnvs, 'nearest_tss_dist'].abs()
a = a[a != 0]
b = gs_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs()
b = b[b != 0]
s,p = stats.mannwhitneyu(a, b)
print(p)
In [32]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
np.log10(a.abs()).hist(ax=ax)
ax.set_title('Significant, median={:.1f}'.format(np.log10(a.abs()).median()))
ax = axs[1]
ax.set_title('Not significant, median={:.1f}'.format(np.log10(b.abs()).median()))
np.log10(b.abs()).hist(ax=ax);
In [33]:
sig_cnvs = set(sig.cnv) - set(sig.ix[sig.overlap_gene, 'cnv'])
not_sig_cnvs = set(reg_results.cnv) - sig_cnvs
a = gs_info.ix[sig_cnvs, 'nearest_tss_dist'].abs()
a = a[a != 0]
b = gs_info.ix[not_sig_cnvs, 'nearest_tss_dist'].abs()
b = b[b != 0]
s,p = stats.mannwhitneyu(a, b)
print(p)
In [34]:
fig,axs = plt.subplots(1, 2)
ax = axs[0]
np.log10(a.abs()).hist(ax=ax)
ax.set_title('Significant, median={:.1f}'.format(np.log10(a.abs()).median()))
ax = axs[1]
ax.set_title('Not significant, median={:.1f}'.format(np.log10(b.abs()).median()))
np.log10(b.abs()).hist(ax=ax);
In [35]:
sig.sort_values(by='overlap_gene', ascending=False).drop_duplicates(subset=['gene']).overlap_gene.value_counts()
Out[35]:
In [36]:
vc = sig.cnv.value_counts()
vc.head(15)
Out[36]:
In [37]:
c = mcnvs_f.ix[sorted(vc.head(13).index)].T.corr(method='spearman')
sns.heatmap(c)
Out[37]:
In [38]:
mcnvs_f_corr = mcnvs_f.T.corr(method='spearman')
sns.clustermap(mcnvs_f_corr, xticklabels=[], yticklabels=[]);
In [39]:
tt = pbt.BedTool('chr7\t143951166\t143953316\n', from_string=True)
In [40]:
res = genes.intersect(tt)
In [41]:
res.head()
In [42]:
tt = sig[sig.cnv == 'CNV_7_143951166_143953316']
data = []
for gene in tt.gene:
d = mcnv_data(gene)
d['gene'] = gene_info.ix[gene, 'gene_name']
data.append(d)
data = pd.concat(data)
data.to_csv(os.path.join(outdir, 'CNV_7_143951166_143953316_data.tsv'), sep='\t')
sns.lmplot(x='CNV_7_143951166_143953316', y='exp', data=data, hue='gene', x_jitter=0.2)
plt.ylabel('$\log$ TPM $z$-score')
plt.xlabel('Diploid copy number')
plt.savefig(os.path.join(outdir, 'CNV_7_143951166_143953316_example.pdf'))
I want to find QTLs where it appears that the CNV does not overlap the gene. At first, this looks true for many of the mCNV eQTLs. While I've already merged CNVs, I think that in some cases they need to be merged more. This should be apparent for genes that are associated with multiple mCNVs.
I'll explore this a bit here.
In [43]:
n = sum(sig.cnv.value_counts() > 2)
print('{} of {} ({:.2f}%) mCNV eQTLs are associated with more than two genes.'.format(
n, len(set(sig.cnv)), float(n) / len(set(sig.cnv)) * 100.))
In [44]:
vc = sig.gene.value_counts()
print('{} of {} mCNV eGenes are associated with more than one mCNV'.format(vc[vc > 1].shape[0], vc.shape[0]))
vc = vc[vc > 1]
In [45]:
overlap = []
for gene in vc.index:
t = sig[sig.gene == gene]
assert len(set(t.chrom)) == 1
start = gs_info.ix[t.cnv, 'start'].min()
end = gs_info.ix[t.cnv, 'end'].max()
overlap.append(not (gene_info.ix[gene, 'start'] > end or gene_info.ix[gene, 'end'] < start))
overlap = pd.Series(overlap, index=vc.index)
print('{} of these {} overlap the combined mCNV.'.format(overlap.sum(), vc.shape[0]))
In [46]:
sig['overlap_gene_cons'] = sig.gene.apply(lambda x: x in overlap[overlap].index) | sig.overlap_gene
In [47]:
t = sig[sig.gene.apply(lambda x: x not in overlap.index)]
se = t.overlap_gene
se.index = t.gene
overlap = pd.concat([overlap, se])
In [48]:
print('{} of {} overlap the gene after combining mCNVs.'.format(overlap.value_counts()[True], overlap.shape[0]))
In [49]:
t = sig[sig.gene.apply(lambda x: x in overlap[overlap == False].index)]
In [50]:
plt.scatter(t.dist_to_gene, t.length)
plt.xlabel('Distance to gene')
plt.ylabel('CNV size');
In [51]:
t.drop_duplicates('gene').length.hist()
Out[51]:
In [52]:
(t.drop_duplicates('gene').dist_to_gene > 10000).value_counts()
Out[52]:
In [53]:
(t.dist_to_gene > 10000).value_counts()
Out[53]:
In [54]:
t.dist_to_gene.hist()
Out[54]:
In [55]:
sig_intergenic_cnvs = set(sig.cnv) - set(sig.ix[sig.overlap_gene, 'cnv'])
not_sig_cnvs = set(reg_results.cnv) - set(sig.cnv)
In [56]:
len(sig_intergenic_cnvs)
Out[56]:
In [57]:
len(not_sig_cnvs)
Out[57]:
In [58]:
t = sig.sort_values(by=['overlap_gene_cons', 'pvalue']).drop_duplicates(subset='gene')
In [59]:
t.shape
Out[59]:
In [60]:
t.overlap_gene_cons.value_counts()
Out[60]:
In [61]:
sig.shape
Out[61]:
In [62]:
sum(sig.cnv.value_counts() > 2)
Out[62]:
In [63]:
t.cnv.value_counts()
Out[63]:
In [64]:
sig.to_csv(os.path.join(outdir, 'sig.tsv'), sep='\t')
In [ ]:
In [65]:
odds = []
pvalues = []
for c in roadmap_overlap.columns:
vc = (roadmap_overlap.ix[sig_intergenic_cnvs, c] > 0).value_counts()
if True not in vc.index:
sig_overlap = 0
else:
sig_overlap = vc[True]
sig_not_overlap = vc[False]
vc = (roadmap_overlap.ix[not_sig_cnvs, c] > 0).value_counts()
not_sig_overlap = vc[True]
not_sig_not_overlap = vc[False]
ctable = [[sig_overlap, sig_not_overlap],
[not_sig_overlap, not_sig_not_overlap]]
o,p = stats.fisher_exact(ctable)
odds.append(o)
pvalues.append(p)
roadmap_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=roadmap_overlap.columns)
colors = pd.DataFrame(zip(list(set([x.split('_')[-1] for x in roadmap_res.index])),
cpb.analysis.tableau20[::2]), columns=['mark', 'color'])
roadmap_res['mark'] = [x.split('_')[-1] for x in roadmap_res.index]
roadmap_res['ind'] = roadmap_res.index
roadmap_res = roadmap_res.merge(colors)
roadmap_res.index = roadmap_res.ind
In [ ]:
plt.figure(figsize=(7, 11))
(np.log2(roadmap_res.odds)).plot(kind='barh', color=roadmap_res.color)
plt.xlabel('')
plt.tight_layout()
In [ ]:
plt.figure(figsize=(7, 11))
(-np.log10(roadmap_res.pvalue)).plot(kind='barh', color=roadmap_res.color)
plt.tight_layout()
In [ ]:
odds = []
pvalues = []
for c in encode_dnase_overlap.columns:
vc = (encode_dnase_overlap.ix[sig_intergenic_cnvs, c] > 0).value_counts()
sig_overlap = vc[True]
sig_not_overlap = vc[False]
vc = (encode_dnase_overlap.ix[not_sig_cnvs, c] > 0).value_counts()
not_sig_overlap = vc[True]
not_sig_not_overlap = vc[False]
ctable = [[sig_overlap, sig_not_overlap],
[not_sig_overlap, not_sig_not_overlap]]
o,p = stats.fisher_exact(ctable)
odds.append(o)
pvalues.append(p)
encode_dnase_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=encode_dnase_overlap.columns)
In [ ]:
(-np.log10(encode_dnase_res.pvalue)).plot(kind='barh')
plt.xlabel('$-\log_{10}$ $p$-value')
plt.tight_layout()
In [ ]:
odds = []
pvalues = []
for c in encode_chip_overlap.columns:
vc = (encode_chip_overlap.ix[sig_intergenic_cnvs, c] > 0).value_counts()
if True not in vc.index:
sig_overlap = 0
else:
sig_overlap = vc[True]
sig_not_overlap = vc[False]
vc = (encode_chip_overlap.ix[not_sig_cnvs, c] > 0).value_counts()
if True not in vc.index:
not_sig_overlap = 0
else:
not_sig_overlap = vc[True]
not_sig_not_overlap = vc[False]
ctable = [[sig_overlap, sig_not_overlap],
[not_sig_overlap, not_sig_not_overlap]]
o,p = stats.fisher_exact(ctable)
odds.append(o)
pvalues.append(p)
encode_chip_res = pd.DataFrame({'odds':odds, 'pvalue':pvalues}, index=encode_chip_overlap.columns)
encode_chip_res.sort_values(by='pvalue', inplace=True, ascending=False)
In [ ]:
(-np.log10(encode_chip_res.pvalue.tail())).plot(kind='barh')
plt.tight_layout()
In [ ]:
promoters = pbt.BedTool('/publicdata/gencode_v19_20151104/promoters_by_gene.bed')
df = promoters.to_dataframe()
df['gene'] = df.name.apply(lambda x: x.split('_')[0])
df['region'] = df.name.apply(lambda x: x.split('_')[2])
df.index = df.region
gb = df.groupby('gene')
gene_to_promoters = gb.groups
In [ ]:
def get_hic_interactions(cnv, gene):
# CNV location
chrom,start,end = gs_info.ix[cnv, ['chrom', 'start', 'end']].values
r = '{}:{}-{}'.format(chrom, start, end)
fn = ('/projects/CARDIPS/pipeline/Hi-C/7_indv_HiC/contact_matrices/'
'merged/normalized/iPSC/iPSC.5Kb.nor.{}.bed.gz'.format(chrom))
t = tabix.open(fn)
# Get lines for CNV.
lines = []
res = t.querys(r)
while True:
try:
lines.append(res.next())
except StopIteration:
break
# Get promoter regions.
ps = [cpb.general.parse_region(x) for x in gene_to_promoters[gene]]
# Get indices of columns for promoter regions.
cols = []
for x in ps:
cols += range(int(x[1]) / 5000, int(x[2]) / 5000 + 1)
cols = sorted(list(set(cols)))
vals = []
# Get values for promoter regions.
for line in lines:
vals.append([float(line[x + 3]) for x in cols])
df = pd.DataFrame(vals)
df.columns = np.array(cols) * 5000
df.index = np.arange((start / 5000) * 5000, (end / 5000 + 1) * 5000, 5000)
# Reflect over promoter regions and get interactions on other side.
cnv_middle = int(start + (end - start) / 2.)
promoter_middle = int(df.columns[0] + (df.columns[-1] - df.columns[0]) / 2.)
null_start = int(2 * promoter_middle - cnv_middle - (end - start) / 2.)
null_end = int(2 * promoter_middle - cnv_middle + (end - start) / 2.)
if null_start < 0 or null_end < 0:
return None, None
else:
# Get lines for CNV.
r = '{}:{}-{}'.format(chrom, null_start, null_end)
lines = []
res = t.querys(r)
while True:
try:
lines.append(res.next())
except StopIteration:
break
vals = []
# Get values for promoter regions.
for line in lines:
vals.append([float(line[x + 3]) for x in cols])
null_df = pd.DataFrame(vals)
null_df.columns = np.array(cols) * 5000
null_df.index = np.arange((null_start / 5000) * 5000, (null_end / 5000 + 1) * 5000, 5000)
return df, null_df
In [ ]:
t = sig[sig.cnv.apply(lambda x: x in sig_intergenic_cnvs)]
In [ ]:
real = []
null = []
dist = []
ind = []
for i in t.index:
df, null_df = get_hic_interactions(t.ix[i, 'cnv'], t.ix[i, 'gene'])
if df is not None:
real.append(df.max().max())
null.append(null_df.max().max())
dist.append(abs(np.mean(df.columns) - np.mean(df.index)))
ind.append(i)
real = pd.Series(real)
null = pd.Series(null)
In [ ]:
hic = pd.DataFrame({'real':real.values, 'null':null.values, 'dist':dist}, index=ind)
In [ ]:
plt.scatter(hic.null * 10000, hic.real * 10000)
In [ ]:
hic[hic.real * 10000 > 3]
In [ ]:
plot_results(t.ix[5764, 'gene'])
In [ ]:
tdf = hic[hic.null == 0]
In [ ]:
tdf.sort_values(by='real', ascending=False, inplace=True)
In [ ]:
tdf.head()
In [ ]:
plot_results(t.ix[4537, 'gene'])
In [ ]: